Script to reproduce years based on a model trained with a specific year¶

Importing¶

In [ ]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import salishsea_tools.viz_tools as sa_vi

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn.metrics import root_mean_squared_error as rmse

from tqdm.auto import tqdm
import random

Datasets Preparation¶

In [ ]:
def datasets_preparation(dataset):
    
    drivers = np.stack([np.ravel(dataset['Temperature_(0m-15m)']),
        np.ravel(dataset['Temperature_(15m-100m)']), np.ravel(dataset['Salinity_(0m-15m)']),
        np.ravel(dataset['Salinity_(15m-100m)'])])
    indx = np.where(~np.isnan(drivers).any(axis=0))
    drivers = drivers[:,indx[0]]

    diat = np.ravel(dataset['Diatom'])
    diat = diat[indx[0]]

    return(drivers, diat, indx)

Regressor¶

In [ ]:
def regressor (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)
    X_train, _, y_train, _ = train_test_split(inputs2, targets)

    model = MLPRegressor(hidden_layer_sizes=200)
    regr = BaggingRegressor(model, n_estimators=12, n_jobs=4).fit(X_train, y_train)

    return (regr)

Regressor for Other Years (Annually)¶

In [ ]:
def regressor2 (inputs, targets, variable_name):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs_test = regr.predict(inputs2)
   
    m = scatter_plot(targets, outputs_test, variable_name) 
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor for Other Years (Daily)¶

In [ ]:
def regressor3 (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs_test = regr.predict(inputs2)
   
    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs_test, deg=1)
    
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor for Individual Days¶

In [ ]:
def regressor4 (inputs, targets, variable_name):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs = regr.predict(inputs2)

    # Post processing
    indx2 = np.full((len(diat_i.y)*len(diat_i.x)),np.nan)
    indx2[indx[0]] = outputs
    model = np.reshape(indx2,(len(diat_i.y),len(diat_i.x)))

    m = scatter_plot(targets, outputs, variable_name + str(dates[i].date())) 

    # Preparation of the dataarray 
    model = xr.DataArray(model,
        coords = {'y': diat_i.y, 'x': diat_i.x},
        dims = ['y','x'],
        attrs=dict( long_name = variable_name + "Concentration",
        units="mmol m-2"),)
                        
    plotting3(targets, model, diat_i, variable_name)

Printing¶

In [ ]:
def printing (targets, outputs, m):

    print ('The amount of data points is', outputs.size)
    print ('The slope of the best fitting line is ', np.round(m,3))
    print ('The correlation coefficient is:', np.round(np.corrcoef(targets, outputs)[0][1],3))
    print (' The root mean square error is:', rmse(targets,outputs))

Scatter Plot¶

In [ ]:
def scatter_plot(targets, outputs, variable_name):

    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs, deg=1)

    printing(targets, outputs, m)

    fig, ax = plt.subplots(2, figsize=(5,10), layout='constrained')

    ax[0].scatter(targets,outputs, alpha = 0.2, s = 10)

    lims = [np.min([ax[0].get_xlim(), ax[0].get_ylim()]),
        np.max([ax[0].get_xlim(), ax[0].get_ylim()])]

    # plot fitted y = m*x + b
    ax[0].axline(xy1=(0, b), slope=m, color='r')

    ax[0].set_xlabel('targets')
    ax[0].set_ylabel('outputs')
    ax[0].set_xlim(lims)
    ax[0].set_ylim(lims)
    ax[0].set_aspect('equal')

    ax[0].plot(lims, lims,linestyle = '--',color = 'k')

    h = ax[1].hist2d(targets,outputs, bins=100, cmap='jet', 
        range=[lims,lims], cmin=0.1, norm='log')
    
    ax[1].plot(lims, lims,linestyle = '--',color = 'k')

    # plot fitted y = m*x + b
    ax[1].axline(xy1=(0, b), slope=m, color='r')

    ax[1].set_xlabel('targets')
    ax[1].set_ylabel('outputs')
    ax[1].set_aspect('equal')

    fig.colorbar(h[3],ax=ax[1], location='bottom')

    fig.suptitle(variable_name)

    plt.show()

    return (m)

Plotting¶

In [ ]:
def plotting (variable, name):

    plt.plot(years,variable, marker = '.', linestyle = '')
    plt.xlabel('Years')
    plt.ylabel(name)
    plt.show()

Plotting 2¶

In [ ]:
def plotting2(variable,title):
    
    fig, ax = plt.subplots()

    scatter= ax.scatter(dates,variable, marker='.', c=pd.DatetimeIndex(dates).month)

    ax.legend(handles=scatter.legend_elements()[0], labels=['February','March','April'])
    fig.suptitle('Daily ' + title + ' (15 Feb - 30 Apr)')
    
    fig.show()

Plotting 3¶

In [ ]:
def plotting3(targets, model, variable, variable_name):

    fig, ax = plt.subplots(2,2, figsize = (10,15))

    cmap = plt.get_cmap('cubehelix')
    cmap.set_bad('gray')

    variable.plot(ax=ax[0,0], cmap=cmap, vmin = targets.min(), vmax =targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    model.plot(ax=ax[0,1], cmap=cmap, vmin = targets.min(), vmax = targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    ((variable-model) / variable * 100).plot(ax=ax[1,0], cmap=cmap, cbar_kwargs={'label': variable_name + ' Concentration  [percentage]'})

    plt.subplots_adjust(left=0.1,
        bottom=0.1, 
        right=0.95, 
        top=0.95, 
        wspace=0.35, 
        hspace=0.35)

    sa_vi.set_aspect(ax[0,0])
    sa_vi.set_aspect(ax[0,1])
    sa_vi.set_aspect(ax[1,0])


    ax[0,0].title.set_text(variable_name + ' (targets)')
    ax[0,1].title.set_text(variable_name + ' (outputs)')
    ax[1,0].title.set_text('targets - outputs')
    ax[1,1].axis('off')

    fig.suptitle(str(dates[i].date()))

    plt.show()
    

Training with the Selected Year¶

In [ ]:
# Dataset and date
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')

ds = ds.isel(time_counter = (np.arange(0, len(ds.Diatom.time_counter),2)), 
    y=(np.arange(ds.y[0], ds.y[-1], 5)), 
    x=(np.arange(ds.x[0], ds.x[-1], 5)))

dates = pd.DatetimeIndex(ds['time_counter'].values)

# just an example
year = 2007
    
dataset = ds.sel(time_counter=str(year))

drivers, diat, _ = datasets_preparation(dataset)

regr = regressor(drivers, diat)

Other Years (Anually)¶

In [ ]:
years = range (2007,2024)

r_all = []
rms_all = []
slope_all = []

for year in tqdm(range (2007,2024)):
    
    dataset = ds.sel(time_counter=str(year))
    
    drivers, diat, _ = datasets_preparation(dataset)

    r, rms, m = regressor2(drivers, diat, 'Diatom ' + str(year))
    
    r_all.append(r)
    rms_all.append(rms)
    slope_all.append(m)
    
plotting(np.transpose(r_all), 'Correlation Coefficient')
plotting(np.transpose(rms_all), 'Root Mean Square Error')
plotting (np.transpose(slope_all), 'Slope of the best fitting line')
  0%|          | 0/17 [00:00<?, ?it/s]
The amount of data points is 70794
The slope of the best fitting line is  0.689
The correlation coefficient is: 0.847
 The root mean square error is: 0.08492132
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.579
The correlation coefficient is: 0.638
 The root mean square error is: 0.12018933
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.342
The correlation coefficient is: 0.545
 The root mean square error is: 0.17375511
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.52
The correlation coefficient is: 0.528
 The root mean square error is: 0.14184783
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.491
The correlation coefficient is: 0.56
 The root mean square error is: 0.14103295
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.497
The correlation coefficient is: 0.66
 The root mean square error is: 0.12135153
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.393
The correlation coefficient is: 0.593
 The root mean square error is: 0.14722152
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.576
The correlation coefficient is: 0.6
 The root mean square error is: 0.12953593
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.246
The correlation coefficient is: 0.246
 The root mean square error is: 0.1923002
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.397
The correlation coefficient is: 0.437
 The root mean square error is: 0.18143404
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.653
The correlation coefficient is: 0.635
 The root mean square error is: 0.11675781
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.189
The correlation coefficient is: 0.223
 The root mean square error is: 0.18490097
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.248
The correlation coefficient is: 0.339
 The root mean square error is: 0.18264662
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.227
The correlation coefficient is: 0.335
 The root mean square error is: 0.21011113
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.502
The correlation coefficient is: 0.62
 The root mean square error is: 0.14087324
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.47
The correlation coefficient is: 0.544
 The root mean square error is: 0.13067208
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.408
The correlation coefficient is: 0.522
 The root mean square error is: 0.14933771
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Other Years (Daily)¶

In [ ]:
r_all2 = np.array([])
rms_all2 = np.array([])
slope_all2 = np.array([])

for i in tqdm(range (0, len(ds.time_counter))):
    
    dataset = ds.isel(time_counter=i)

    drivers, diat, _ = datasets_preparation(dataset)

    r, rms, m = regressor3(drivers, diat)

    r_all2 = np.append(r_all2,r)
    rms_all2 = np.append(rms_all2,rms)
    slope_all2 = np.append(slope_all2,m)

plotting2(r_all2, 'Correlation Coefficients')
plotting2(rms_all2, 'Root Mean Square Errors')
plotting2(slope_all2, 'Slope of the best fitting line')
  0%|          | 0/640 [00:00<?, ?it/s]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Daily Maps¶

In [ ]:
maps = random.sample(range(0,len(ds.time_counter)),10)

for i in tqdm(maps):

    dataset = ds.isel(time_counter=i)
    drivers, diat, indx = datasets_preparation(dataset)

    diat_i = dataset['Diatom']

    regressor4(drivers, diat, 'Diatom ')
  0%|          | 0/10 [00:00<?, ?it/s]
The amount of data points is 1863
The slope of the best fitting line is  0.868
The correlation coefficient is: 0.327
 The root mean square error is: 0.17977755
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.228
The correlation coefficient is: 0.235
 The root mean square error is: 0.28865388
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.524
The correlation coefficient is: 0.517
 The root mean square error is: 0.20530148
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.217
The correlation coefficient is: 0.17
 The root mean square error is: 0.1675093
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.963
The correlation coefficient is: 0.344
 The root mean square error is: 0.20419216
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.163
The correlation coefficient is: 0.083
 The root mean square error is: 0.13938873
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.921
The correlation coefficient is: 0.326
 The root mean square error is: 0.14858162
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.608
The correlation coefficient is: 0.411
 The root mean square error is: 0.14184028
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  -0.156
The correlation coefficient is: -0.136
 The root mean square error is: 0.18805473
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  1.382
The correlation coefficient is: 0.398
 The root mean square error is: 0.2234572
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]: